https://www.datacamp.com/courses/machine-learning-in-the-tidyverse
# install.packages('dslabs')
library(dslabs)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
Эта функция создает новый df, где колонка data содержит все данные для переменной по которой группируем.
# Explore gapminder
head(gapminder)
## country year infant_mortality life_expectancy fertility
## 1 Albania 1960 115.40 62.87 6.19
## 2 Algeria 1960 148.20 47.50 7.65
## 3 Angola 1960 208.00 35.98 7.32
## 4 Antigua and Barbuda 1960 NA 62.97 4.43
## 5 Argentina 1960 59.87 65.39 3.11
## 6 Armenia 1960 NA 66.86 4.55
## population gdp continent region
## 1 1636054 NA Europe Southern Europe
## 2 11124892 13828152297 Africa Northern Africa
## 3 5270844 NA Africa Middle Africa
## 4 54681 NA Americas Caribbean
## 5 20619075 108322326649 Americas South America
## 6 1867396 NA Asia Western Asia
# Prepare the nested dataframe gap_nested
library(tidyverse)
## ── Attaching packages ──────────────────────────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.2.1 ✔ readr 1.3.1
## ✔ tibble 2.1.3 ✔ purrr 0.3.2
## ✔ tidyr 1.0.0 ✔ stringr 1.4.0
## ✔ ggplot2 3.2.1 ✔ forcats 0.4.0
## ── Conflicts ─────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
gap_nested <- gapminder %>%
group_by(country) %>%
nest()
# Explore gap_nested
head(gap_nested)
## # A tibble: 6 x 2
## # Groups: country [185]
## country data
## <fct> <list<df[,8]>>
## 1 Albania [57 × 8]
## 2 Algeria [57 × 8]
## 3 Angola [57 × 8]
## 4 Antigua and Barbuda [57 × 8]
## 5 Argentina [57 × 8]
## 6 Armenia [57 × 8]
C помощью функции unnest() можно вернуться к изначальному df Фнукция identical() сравнивает 2 объекта
# Create the unnested dataframe called gap_unnnested
gap_unnested <- gap_nested %>%
unnest(data)
# Confirm that your data was not modified
identical(gap_unnested, gapminder)
## [1] FALSE
К колонке data можно обращаться следующим образом То есть вытаскивать df и работать с ним как с обычным
# Extract the data of Algeria
algeria_df <- gap_nested$data[[1]]
# Calculate the minimum of the population vector
min(algeria_df$population)
## [1] NA
# Calculate the maximum of the population vector
max(algeria_df$population)
## [1] NA
# Calculate the mean of the population vector
mean(algeria_df$population)
## [1] NA
Можно высчитывать не только среднее для одного df, а сразу для всех с помощью map
# Calculate the mean population for each country
pop_nested <- gap_nested %>%
mutate(mean_pop = map(data, ~mean(.x$population)))
# Take a look at pop_nested
head(pop_nested)
## # A tibble: 6 x 3
## # Groups: country [185]
## country data mean_pop
## <fct> <list<df[,8]>> <list>
## 1 Albania [57 × 8] <dbl [1]>
## 2 Algeria [57 × 8] <dbl [1]>
## 3 Angola [57 × 8] <dbl [1]>
## 4 Antigua and Barbuda [57 × 8] <dbl [1]>
## 5 Argentina [57 × 8] <dbl [1]>
## 6 Armenia [57 × 8] <dbl [1]>
# Extract the mean_pop value by using unnest
pop_mean <- pop_nested %>%
unnest(mean_pop)
# Take a look at pop_mean
head(pop_mean)
## # A tibble: 6 x 3
## # Groups: country [185]
## country data mean_pop
## <fct> <list<df[,8]>> <dbl>
## 1 Albania [57 × 8] NA
## 2 Algeria [57 × 8] NA
## 3 Angola [57 × 8] NA
## 4 Antigua and Barbuda [57 × 8] NA
## 5 Argentina [57 × 8] NA
## 6 Armenia [57 × 8] NA
map возращает лись, поэтому приходится применять unnest() Чтобы избежать этого, можно сразу использовать семейство функций map_*(map_dbl)
# Calculate mean population and store result as a double
pop_mean <- gap_nested %>%
mutate(mean_pop = map_dbl(data, ~mean(.x$population)))
# Take a look at pop_mean
head(pop_mean)
## # A tibble: 6 x 3
## # Groups: country [185]
## country data mean_pop
## <fct> <list<df[,8]>> <dbl>
## 1 Albania [57 × 8] NA
## 2 Algeria [57 × 8] NA
## 3 Angola [57 × 8] NA
## 4 Antigua and Barbuda [57 × 8] NA
## 5 Argentina [57 × 8] NA
## 6 Armenia [57 × 8] NA
С помощью map можно засовывать результаты линейной регрессии
# Build a linear model for each country
gap_models <- gap_nested %>%
mutate(model = map(data, ~lm(formula = life_expectancy~year, data = .x)))
# Extract the model for Algeria
algeria_model <- gap_models$model[[1]]
# View the summary for the Algeria model
summary(algeria_model)
##
## Call:
## lm(formula = life_expectancy ~ year, data = .x)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.7990 -0.3172 -0.1635 0.5574 1.1488
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -3.974e+02 1.236e+01 -32.14 <2e-16 ***
## year 2.362e-01 6.219e-03 37.99 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.7724 on 55 degrees of freedom
## Multiple R-squared: 0.9633, Adjusted R-squared: 0.9626
## F-statistic: 1443 on 1 and 55 DF, p-value: < 2.2e-16
есть три основные функции
Информация о коэффициентах
library(broom)
# Extract the coefficients of the algeria_model as a dataframe
tidy(algeria_model)
## # A tibble: 2 x 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) -397. 12.4 -32.1 2.48e-37
## 2 year 0.236 0.00622 38.0 3.72e-41
Информация о статистиках(R^2)
# Extract the statistics of the algeria_model as a dataframe
glance(algeria_model)
## # A tibble: 1 x 11
## r.squared adj.r.squared sigma statistic p.value df logLik AIC BIC
## <dbl> <dbl> <dbl> <dbl> <dbl> <int> <dbl> <dbl> <dbl>
## 1 0.963 0.963 0.772 1443. 3.72e-41 2 -65.1 136. 142.
## # … with 2 more variables: deviance <dbl>, df.residual <int>
Информация о реальных значениях, о предсказанных об ошибках
algeria_fitted <- augment(algeria_model)
График линии модели и данных
library(ggplot2)
algeria_fitted %>%
ggplot(mapping = aes(x = year)) +
geom_point(mapping = aes(y = life_expectancy)) +
geom_line(mapping = aes(y = .fitted), color = "red")
Можно применить функцию tidy для всех стран и записать их в табличку
# Extract the coefficient statistics of each model into nested dataframes
model_coef_nested <- gap_models %>%
mutate(coef = map(model, ~tidy(.x)))
# Simplify the coef dataframes for each model
model_coef <- model_coef_nested %>%
unnest(coef)
После чего можно работать с этой табличкой Например нарисовать гистограмму распределения коэффициента при переменной year
model_coef %>%
filter(term == "year") %>%
ggplot(aes(x = estimate)) +
geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Получим статистики по всем странам
# Extract the fit statistics of each model into dataframes
model_perf_nested <- gap_models %>%
mutate(fit = map(model, ~glance(.x)))
# Simplify the fit dataframes for each model
model_perf <- model_perf_nested %>%
unnest(fit)
# Look at the first six rows of model_perf
head(model_perf)
## # A tibble: 6 x 14
## # Groups: country [185]
## country data model r.squared adj.r.squared sigma statistic p.value
## <fct> <list<d> <lis> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Albania [57 × 8] <lm> 0.963 0.963 0.772 1443. 3.72e-41
## 2 Algeria [57 × 8] <lm> 0.938 0.937 2.53 828. 7.84e-35
## 3 Angola [57 × 8] <lm> 0.989 0.989 0.698 5091. 6.69e-56
## 4 Antigu… [57 × 8] <lm> 0.938 0.937 0.977 828. 7.64e-35
## 5 Argent… [57 × 8] <lm> 0.983 0.982 0.479 3103. 4.58e-50
## 6 Armenia [57 × 8] <lm> 0.288 0.275 1.57 22.2 1.70e- 5
## # … with 6 more variables: df <int>, logLik <dbl>, AIC <dbl>, BIC <dbl>,
## # deviance <dbl>, df.residual <int>
Потом можно уже работать с этой табличкой
Построить гистограмму R2
model_perf %>%
ggplot(aes(x = r.squared)) +
geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Выбрать топ-4 лудших и худших моделей
# Extract the 4 best fitting models
best_fit <- model_perf %>%
top_n(n = 4, wt = r.squared)
# Extract the 4 models with the worst fit
worst_fit <- model_perf %>%
top_n(n = 4, wt = -r.squared)
Это дает возможность легко рисовать графики для всех стран
сделаем таблички для лучших и худших стран
best_augmented <- best_fit %>%
# Build the augmented dataframe for each country model
mutate(augmented = map(.x = model, ~augment(.x))) %>%
# Expand the augmented dataframes
unnest(augmented)
worst_augmented <- worst_fit %>%
# Build the augmented dataframe for each country model
mutate(augmented = map(.x = model, ~augment(.x))) %>%
# Expand the augmented dataframes
unnest(augmented)
Графики для лучших
# Compare the predicted values with the actual values of life expectancy
# for the top 4 best fitting models
best_augmented %>%
ggplot(aes(x = year)) +
geom_point(aes(y = life_expectancy)) +
geom_line(aes(y = .fitted), color = "red") +
facet_wrap(~country, scales = "free_y")
Графики для худших
# Compare the predicted values with the actual values of life expectancy
# for the top 4 worst fitting models
worst_augmented %>%
ggplot(aes(x = year)) +
geom_point(aes(y = life_expectancy)) +
geom_line(aes(y = .fitted), color = "red") +
facet_wrap(~country, scales = "free_y")
Сделаем не парную а мультирегерссию
# Build a linear model for each country using all features
gap_fullmodel <- gap_nested %>%
mutate(model = map(data, ~lm(formula = life_expectancy ~ ., data = .x)))
## Error in `contrasts<-`(`*tmp*`, value = contr.funs[1 + isOF[nn]]): contrasts can be applied only to factors with 2 or more levels
fullmodel_perf <- gap_fullmodel %>%
# Extract the fit statistics of each model into dataframes
mutate(fit = map(model, ~glance(.x))) %>%
# Simplify the fit dataframes for each model
unnest(fit)
## Error in eval(lhs, parent, parent): объект 'gap_fullmodel' не найден
# View the performance for the four countries with the worst fitting
# four simple models you looked at before
fullmodel_perf %>%
filter(country %in% worst_fit$country) %>%
select(country, adj.r.squared)
## Error in eval(lhs, parent, parent): объект 'fullmodel_perf' не найден
Все функции для деления в пакете rsample
# install.packages('rsample')
library(rsample)
Деление на train и test в пропорции 3 к 4
set.seed(42)
# Prepare the initial split object
gap_split <- initial_split(gapminder, prop = 0.75)
# Extract the training dataframe
training_data <- training(gap_split)
# Extract the testing dataframe
testing_data <- testing(gap_split)
# Calculate the dimensions of both training_data and testing_data
dim(training_data)
## [1] 7909 9
dim(testing_data)
## [1] 2636 9
Теперь используем кросс валидацию
set.seed(42)
# Prepare the dataframe containing the cross validation partitions
cv_split <- vfold_cv(training_data, v = 5)
cv_data <- cv_split %>%
mutate(
# Extract the train dataframe for each split
train = map(splits, ~training(.x)),
# Extract the validate dataframe for each split
validate = map(splits, ~testing(.x))
)
# Use head() to preview cv_data
head(cv_data)
## # A tibble: 5 x 4
## splits id train validate
## * <named list> <chr> <named list> <named list>
## 1 <split [6.3K/1.6K]> Fold1 <df[,9] [6,327 × 9]> <df[,9] [1,582 × 9]>
## 2 <split [6.3K/1.6K]> Fold2 <df[,9] [6,327 × 9]> <df[,9] [1,582 × 9]>
## 3 <split [6.3K/1.6K]> Fold3 <df[,9] [6,327 × 9]> <df[,9] [1,582 × 9]>
## 4 <split [6.3K/1.6K]> Fold4 <df[,9] [6,327 × 9]> <df[,9] [1,582 × 9]>
## 5 <split [6.3K/1.6K]> Fold5 <df[,9] [6,328 × 9]> <df[,9] [1,581 × 9]>
Обучаем модели на каждом фолде
# Build a model using the train data for each fold of the cross validation
cv_models_lm <- cv_data %>%
mutate(model = map(.x = train, ~lm(formula = life_expectancy ~ ., data = .x)))
Вытаскиваем настоящие значения и делаем прогноз
cv_prep_lm <- cv_models_lm %>%
mutate(
# Extract the recorded life expectancy for the records in the validate dataframes
validate_actual = map(validate, ~.x$life_expectancy),
# Predict life expectancy for each validate set using its corresponding model
validate_predicted = map2(.x = model, .y = validate, ~predict(.x, .y))
)
## Error in model.frame.default(Terms, newdata, na.action = na.action, xlev = object$xlevels): factor country has new levels Aruba, French Polynesia, Greenland, Hong Kong, China, Macao, China, New Caledonia, Puerto Rico
разные метрики есть в пакете Metrics
# install.packages("Metrics")
library(Metrics)
считаем метрику для каждого фолда, а потом среднее
library(Metrics)
# Calculate the mean absolute error for each validate fold
cv_eval_lm <- cv_prep_lm %>%
mutate(validate_mae = map2_dbl(validate_actual, validate_predicted, ~mae(actual = .x, predicted = .y)))
## Error in eval(lhs, parent, parent): объект 'cv_prep_lm' не найден
# Print the validate_mae column
cv_eval_lm$validate_mae
## Error in eval(expr, envir, enclos): объект 'cv_eval_lm' не найден
# Calculate the mean of validate_mae column
mean(cv_eval_lm$validate_mae)
## Error in mean(cv_eval_lm$validate_mae): объект 'cv_eval_lm' не найден
находится в пакете ranger
# install.packages('ranger')
library(ranger)
Обучим и посмотрим
# Build a random forest model for each fold
cv_models_rf <- cv_data %>%
mutate(model = map(train, ~ranger(formula = life_expectancy~., data = .x,
num.trees = 100, seed = 42)))
## Error: Missing data in columns: infant_mortality, fertility, population, gdp.
# Generate predictions using the random forest model
cv_prep_rf <- cv_models_rf %>%
mutate(validate_predicted = map2(.x = model, .y = validate, ~predict(.x, .y)$predictions))
## Error in eval(lhs, parent, parent): объект 'cv_models_rf' не найден
теперь померием метрику
# Calculate validate MAE for each fold
cv_eval_rf <- cv_prep_rf %>%
mutate(validate_mae = map2_dbl(validate_actual, validate_predicted, ~mae(actual = .x, predicted = .y)))
## Error in eval(lhs, parent, parent): объект 'cv_prep_rf' не найден
# Print the validate_mae column
print(cv_eval_rf$validate_mae)
## Error in print(cv_eval_rf$validate_mae): объект 'cv_eval_rf' не найден
# Calculate the mean of validate_mae column
mean(cv_eval_rf$validate_mae)
## Error in mean(cv_eval_rf$validate_mae): объект 'cv_eval_rf' не найден
она меньше чем у LR
Тюним модель по параметру mtry
# Prepare for tuning your cross validation folds by varying mtry
cv_tune <- cv_data %>%
crossing(mtry = 2:5)
# Build a model for each fold & mtry combination
cv_model_tunerf <- cv_tune %>%
mutate(model = map2(.x = train, .y = mtry, ~ranger(formula = life_expectancy~.,
data = .x, mtry = .y,
num.trees = 100, seed = 42)))
## Error: Missing data in columns: infant_mortality, fertility, population, gdp.
Делаем предикт и Смотрим для какого параметра модель лучше
# Generate validate predictions for each model
cv_prep_tunerf <- cv_model_tunerf %>%
mutate(validate_predicted = map2(.x = model, .y = validate, ~predict(.x, .y)$predictions))
## Error in eval(lhs, parent, parent): объект 'cv_model_tunerf' не найден
# Calculate validate MAE for each fold and mtry combination
cv_eval_tunerf <- cv_prep_tunerf %>%
mutate(validate_mae = map2_dbl(.x = validate_actual, .y = validate_predicted, ~mae(actual = .x, predicted = .y)))
## Error in eval(lhs, parent, parent): объект 'cv_prep_tunerf' не найден
# Calculate the mean validate_mae for each mtry used
cv_eval_tunerf %>%
group_by(mtry) %>%
summarise(mean_mae = mean(validate_mae))
## Error in eval(lhs, parent, parent): объект 'cv_eval_tunerf' не найден
Для параметра 4!
Поняли, какая модель лучшая, поэтому обучаем ее на все train и делаем predict на test
best_model <- ranger(formula = life_expectancy~., data = training_data,
mtry = 4, num.trees = 100, seed = 42)
## Error: Missing data in columns: infant_mortality, fertility, population, gdp.
test_actual <- testing_data$life_expectancy
test_predict <- predict(best_model, testing_data)$predictions
## Error in predict(best_model, testing_data): объект 'best_model' не найден
mae(test_actual, test_predict)
## Error in ae(actual, predicted): объект 'test_predict' не найден